{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#pip install gffutils\n",
    "from collections import defaultdict\n",
    "\n",
    "import gffutils\n",
    "import sqlite3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm -f ag.db"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    db = gffutils.create_db('https://vectorbase.org/common/downloads/Pre-VEuPathDB%20VectorBase%20files/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.2.gff3.gz', 'ag.db')\n",
    "except sqlite3.OperationalError:\n",
    "    db = gffutils.FeatureDB('ag.db')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['CDS', 'RNase_P_RNA', 'SRP_RNA', 'contig', 'exon', 'five_prime_UTR', 'gene', 'mRNA', 'miRNA', 'misc_RNA', 'pseudogene', 'rRNA', 'snRNA', 'snoRNA', 'tRNA', 'tRNA_pseudogene', 'three_prime_UTR']\n",
      "CDS 62408\n",
      "RNase_P_RNA 1\n",
      "SRP_RNA 3\n",
      "contig 8\n",
      "exon 66485\n",
      "five_prime_UTR 10520\n",
      "gene 13624\n",
      "mRNA 14697\n",
      "miRNA 187\n",
      "misc_RNA 10\n",
      "pseudogene 5\n",
      "rRNA 53\n",
      "snRNA 38\n",
      "snoRNA 12\n",
      "tRNA 463\n",
      "tRNA_pseudogene 9\n",
      "three_prime_UTR 7281\n"
     ]
    }
   ],
   "source": [
    "print(list(db.featuretypes()))\n",
    "for feat_type in db.featuretypes():\n",
    "    print(feat_type, db.count_features_of_type(feat_type))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2L\tVectorBase\tcontig\t1\t49364325\t.\t.\t.\tID=2L;molecule_type=dsDNA;translation_table=1;topology=linear;localization=chromosomal\n",
      "3R\tVectorBase\tcontig\t1\t53200684\t.\t.\t.\tID=3R;molecule_type=dsDNA;translation_table=1;topology=linear;localization=chromosomal\n",
      "UNKN\tVectorBase\tcontig\t1\t42389979\t.\t.\t.\tID=UNKN;molecule_type=dsDNA;translation_table=1;topology=linear;localization=chromosomal\n",
      "X\tVectorBase\tcontig\t1\t24393108\t.\t.\t.\tID=X;molecule_type=dsDNA;translation_table=1;topology=linear;localization=chromosomal\n",
      "Y_unplaced\tVectorBase\tcontig\t1\t237045\t.\t.\t.\tID=Y_unplaced;molecule_type=dsDNA;translation_table=1;topology=linear;localization=chromosomal\n",
      "Mt\tVectorBase\tcontig\t1\t15363\t.\t.\t.\tID=Mt;molecule_type=dsDNA;translation_table=1;topology=linear;localization=chromosomal\n",
      "2R\tVectorBase\tcontig\t1\t61545105\t.\t.\t.\tID=2R;molecule_type=dsDNA;translation_table=1;topology=linear;localization=chromosomal\n",
      "3L\tVectorBase\tcontig\t1\t41963435\t.\t.\t.\tID=3L;molecule_type=dsDNA;translation_table=1;topology=linear;localization=chromosomal\n"
     ]
    }
   ],
   "source": [
    "for contig in db.features_of_type('contig'):\n",
    "    print(contig)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "contig 2L, number of genes 3105\n",
      "contig 3R, number of genes 2763\n",
      "contig UNKN, number of genes 567\n",
      "contig X, number of genes 1097\n",
      "contig Y_unplaced, number of genes 0\n",
      "contig Mt, number of genes 37\n",
      "contig 2R, number of genes 3834\n",
      "contig 3L, number of genes 2221\n",
      "Max number of exons: AGAP001660 (67)\n",
      "Max span: AGAP006656 (365621)\n",
      "defaultdict(<class 'int'>, {2: 910, 1: 11595, 3: 211, 4: 74, 0: 781, 11: 3, 5: 27, 8: 4, 12: 1, 7: 5, 6: 9, 13: 1, 10: 1, 20: 1, 9: 1})\n",
      "defaultdict(<class 'int'>, {4: 2091, 2: 3359, 5: 1411, 6: 1039, 1: 2019, 3: 2838, 9: 419, 10: 298, 11: 202, 8: 454, 12: 159, 31: 5, 7: 718, 13: 106, 15: 65, 19: 28, 16: 45, 17: 53, 14: 65, 26: 3, 18: 22, 21: 9, 22: 7, 24: 6, 30: 5, 20: 19, 32: 1, 33: 1, 27: 2, 28: 5, 23: 6, 34: 1, 29: 4, 25: 9, 67: 1, 50: 1, 49: 1, 42: 1})\n"
     ]
    }
   ],
   "source": [
    "num_mRNAs = defaultdict(int)\n",
    "num_exons = defaultdict(int)\n",
    "max_exons = 0\n",
    "max_span = 0\n",
    "for contig in db.features_of_type('contig'):\n",
    "    cnt = 0\n",
    "    for gene in db.region((contig.seqid, contig.start, contig.end), featuretype='gene'):\n",
    "        cnt += 1\n",
    "        span = abs(gene.start - gene.end) # strand\n",
    "        if span > max_span:\n",
    "            max_span = span\n",
    "            max_span_gene = gene\n",
    "        my_mRNAs = list(db.children(gene, featuretype='mRNA'))\n",
    "        num_mRNAs[len(my_mRNAs)] += 1\n",
    "        if len(my_mRNAs) == 0:\n",
    "            exon_check = [gene]\n",
    "        else:\n",
    "            exon_check = my_mRNAs\n",
    "        for check in exon_check:\n",
    "            my_exons = list(db.children(check, featuretype='exon'))\n",
    "            num_exons[len(my_exons)] += 1\n",
    "            if len(my_exons) > max_exons:\n",
    "                max_exons = len(my_exons)\n",
    "                max_exons_gene = gene\n",
    "    print('contig %s, number of genes %d' % (contig.seqid, cnt))\n",
    "print('Max number of exons: %s (%d)' % (max_exons_gene.id, max_exons))\n",
    "print('Max span: %s (%d)' % (max_span_gene.id, max_span))\n",
    "print(num_mRNAs)\n",
    "print(num_exons)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
